Utilizing machine learning techniques to generate value from a pulp sensibility data set. We use supervised learning algorithms to solve the classification problem of predicting the need for a supplement, and also try to identify the root cause of this problem.
We have collected data from 128 patients with 20 features, including the binary target feature ("Need Supliment") indicating whether or not a patient needs a supplement.
# Core data-handling and visualization libraries for this analysis.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

# Silence pandas FutureWarnings so notebook output stays readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the pulp-sensibility data set and discard any duplicated rows.
df = pd.read_csv("Pulp Sensibility.csv").drop_duplicates()

# Quick structural overview: dtypes / non-null counts, then the first rows.
df.info()
df.head()
A data-profile report used to explore the contents of the collected data set.
# NOTE(review): `pandas_profiling` is deprecated and has been renamed to
# `ydata_profiling`; if this import starts failing, confirm which package is
# installed in the environment.
from pandas_profiling import ProfileReport
# Create and display a full-width report summarizing the pulp-sensibility
# data set inside the notebook.
profile = ProfileReport(df,
title='Pulp Sensibility Data Profile Report',
html={'style': {
'full_width': True
}})
profile.to_notebook_iframe()
# Columns treated as categorical. Several names intentionally carry trailing
# spaces — they must match the CSV header exactly.
categorical_columns = ['Patient', 'Dental History', 'Medical History',
                       'Percussion ', 'Palpation', 'Mobility',
                       'PDL involvement', 'Curved Canal ', 'Pulp stone or and Calcification',
                       'PDL space', 'Lamina Dura',
                       'Need Supliment']
df_cat = df[categorical_columns]
df_cat.head()

# Everything that is not categorical is treated as numeric.
df_num = df.drop(columns=df_cat.columns)
df_num.head()
# Class balance of the target: how many patients do / do not need a supplement.
target_counts = df['Need Supliment'].value_counts()
fig = px.bar(target_counts,
             color=target_counts.index,
             text_auto=True,
             labels=dict(index="Requirements of Suppliment",
                         value="Total Number of Patients"))
fig.show()
Observations:
def bar_chart(feature):
    """Grouped bar chart of `feature` value counts, split by supplement need.

    Reads the module-level `df`; displays the figure as a side effect.
    """
    # Per-category counts within each target class.
    needed = df[df['Need Supliment'] == 1][feature].value_counts()
    not_needed = df[df['Need Supliment'] == 0][feature].value_counts()
    view = pd.DataFrame([needed, not_needed])
    view.index = ['Suppliment (Required)','Suppliment (Not Required)']
    fig = px.bar(view, barmode='group', text_auto=True,
                 labels=dict(index="Requirements of Suppliment",
                             value="Total Number of Patients"))
    fig.show()

bar_chart('Patient')
Observations:
# Distribution of medical-history categories by supplement need.
bar_chart('Medical History')
Observations:
# Distribution of dental-history categories by supplement need.
bar_chart('Dental History')
Observations:
# Distribution of pulp stone / calcification categories by supplement need.
bar_chart('Pulp stone or and Calcification')
Observations:
# Age distribution across all patients.
age_hist = px.histogram(df, x='Age', text_auto=True)
age_hist.show()
Observations:
# Mean age per target class. `numeric_only=True` restricts the mean to the
# numeric columns — without it, pandas >= 2.0 raises a TypeError on the
# remaining object-dtype (string) columns (older pandas only warned, and the
# warning is suppressed above).
fig = px.bar(data_frame=df.groupby(['Need Supliment']).mean(numeric_only=True).reset_index(),
             x="Need Supliment", y="Age", text_auto=True)
fig.show()
Observations:
# Mean pain duration (days) per target class. `numeric_only=True` restricts
# the mean to numeric columns — required on pandas >= 2.0, where object-dtype
# columns in the group otherwise raise a TypeError.
fig = px.bar(data_frame=df.groupby(['Need Supliment']).mean(numeric_only=True).reset_index(),
             x="Need Supliment", y="Pain ( Duration) days", text_auto=True)
fig.show()
Observations:
# Mean EPT duration before anaesthesia per target class. `numeric_only=True`
# restricts the mean to numeric columns — required on pandas >= 2.0, where
# object-dtype columns in the group otherwise raise a TypeError.
fig = px.bar(data_frame=df.groupby(['Need Supliment']).mean(numeric_only=True).reset_index(),
             x="Need Supliment", y='EPT (Duration) before anaesthesia ', text_auto=True)
fig.show()
Observations:
# Encode the two binary string features numerically.
# NOTE(review): the mapping key assumes 'Lamina Dura' holds the *string* '0'
# (not the integer 0); any value outside the dict silently becomes NaN —
# confirm against the raw CSV.
df['Lamina Dura'] = df['Lamina Dura'].map({'0':0,'LOSS':1})
# Sex of the patient: 'M' -> 1, 'F' -> 0.
df['Patient'] = df['Patient'].map({'M':1,'F':0})
df.head()
# Work on a copy so `df` keeps its current (partially-encoded) state.
clean_df = df.copy()

# One-hot encode every remaining string-typed column; the dummies replace the
# original categorical columns and are appended after the numeric ones.
cat_features = clean_df.select_dtypes('object').columns
dummies = pd.get_dummies(clean_df[cat_features])
clean_df = pd.concat([clean_df.drop(cat_features, axis=1), dummies], axis=1)
clean_df.head()
# Rank every encoded feature by its correlation with the target.
correlations = clean_df.corrwith(df['Need Supliment']).sort_values(ascending=False)
df2 = (pd.DataFrame(correlations)
       .set_axis(['Correlation Coefficient'], axis=1)
       .reset_index()
       .rename(columns={'index': 'Features'}))
df2
# Keep only features whose correlation with the target is at least 0.15.
# (The target itself passes this filter; it is split off again below.)
selected = df2.loc[df2['Correlation Coefficient'] >= 0.15, 'Features'].to_list()
transformed_df = clean_df[selected]
transformed_df.head()
# Import the scikit-learn function used to split the data set
# Scikit-learn helper for partitioning the data set.
from sklearn.model_selection import train_test_split

# Separate the explanatory features ("x") from the target ("y").
x = transformed_df.drop('Need Supliment', axis=1)
y = transformed_df['Need Supliment']

# Hold out 20% of the rows as a test set; fix the seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=87)
GridSearch cross-validation for the logistic regression model is performed below.
# Import the scikit-learn functions and classes necessary to perform cross-validation
# Cross-validation / pipeline machinery.
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
# Persisting and restoring trained models.
from joblib import dump, load
# The estimator being tuned.
from sklearn.linear_model import LogisticRegression

# Pipeline: z-score standardization followed by a logistic-regression fit.
lr_pipeline = make_pipeline(preprocessing.StandardScaler(),
                            LogisticRegression(max_iter=150))

# Regularization strengths explored by the grid search.
lr_param_grid = {'logisticregression__C': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}

# 10-fold cross-validation scored on both F1 and accuracy; the final model is
# refit on the full training set with the F1-best hyperparameters.
logistic_regression = GridSearchCV(lr_pipeline, lr_param_grid, cv=10,
                                   scoring=['f1', 'accuracy'],
                                   refit='f1', verbose=0, n_jobs=-1)

# Train / cross-validate (discarding the returned object) and persist the
# fitted search so it can be reused without retraining.
_ = logistic_regression.fit(x_train, y_train)
_ = dump(logistic_regression, 'logistic_regression.joblib')
GridSearch cross-validation for the KNN model is performed below.
# Import the scikit-learn class used to implement a KNN classifier
from sklearn.neighbors import KNeighborsClassifier
# Create a pipeline specifying all of the operations to perform when training the model
# In this case, the pipepline consists of z-score standardization and initialization of a KNN classifier
pipeline_knn = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier(algorithm = 'ball_tree'))
# Specify the hyperparameters and their corresponding values that are to be used in GridSearch
hyperparameters_knn = { 'kneighborsclassifier__n_neighbors' : [3, 5] }
# Initialize the GridSearch cross-validation object, specifying 5 folds for 5-fold cross-validation and
# "f1" and "accuracy" as the evaluation metrics for cross-validation scoring
knn = GridSearchCV(pipeline_knn, hyperparameters_knn, cv = 5, scoring = ['f1', 'accuracy'],
refit = 'f1', verbose = 0, n_jobs = -1)
# Cross-validate the KNN model and ignore the function output
_ = knn.fit(x_train, y_train)
# Save the model so it can be used again without redefining it
_ = dump(knn, 'knn.joblib')
Having trained and cross-validated the models, I then used the models to make predictions on the test set. I evaluated the performance of the models on the test set using the same F1 and accuracy metrics used to evaluate the models during cross-validation. The performance of the models as indicated by these metrics is displayed below.
# Import the scikit-learn functions used to calculate the F1 score and accuracy on the test set
from sklearn.metrics import f1_score, accuracy_score
# Use the best logistic regression model to make predictions on the test set
y_test_pred_lr = logistic_regression.predict(x_test)
# Display the F1 and ROC AUC on the train and test sets for the logistic regression model
print('Logistic regression F1 (train):',
round(logistic_regression.cv_results_['mean_test_f1'][logistic_regression.best_index_], 3))
print('Logistic regression F1 (test):', round(f1_score(y_test, y_test_pred_lr), 3), '\n')
print('Logistic regression accuracy (train):',
round(logistic_regression.cv_results_['mean_test_accuracy'][logistic_regression.best_index_], 3))
print('Logistic regression accuracy (test):',
round(accuracy_score(y_test, y_test_pred_lr), 3), '\n')
# Use the best KNN model to make predictions on the test set
y_test_pred_knn = knn.predict(x_test)
# Display the F1 and ROC AUC on the train and test sets for the KNN model
print('KNN F1 (train):',
round(knn.cv_results_['mean_test_f1'][knn.best_index_], 3))
print('KNN F1 (test):', round(f1_score(y_test, y_test_pred_knn), 3), '\n')
print('KNN accuracy (train):',
round(knn.cv_results_['mean_test_accuracy'][knn.best_index_], 3))
print('KNN accuracy (test):',
round(accuracy_score(y_test, y_test_pred_knn), 3), '\n')
To objectively determine the degree of bias and variance exhibited by the models, I used the guidelines presented below.
Bias: